/**************************************************************************
* This program pension records form 5500 data for 1999
/*************************************************************************/

	/**************************************************************************
	* Step 1: Read the raw file
	***************************************************************************/

	data raw99   ;
		infile '/.../F5500/f5500/f_5500_1999.csv' delimiter = ',' MISSOVER DSD lrecl=/*redacted*/ firstobs=/*redacted*/ ;
		/*redacted*/
	run;

	/**************************************************************************
	* Step 2: clean the raw file and make variable names consistent
	***************************************************************************/

	data raw99
		(keep = DB
			DC
			cash_bal
			xDB
			plan_eff_year
			plan_qtr_strt
			plan_year_strt
			plan_qtr_end
			plan_year_end
			form_year
			filing_id
			filing_id_char
			adj_active
			collective_bargain_ind
			last_rpt_plan_num
			last_rpt_spons_ein
			opr_ein
			opr_pn
			partcp_account_bal_cnt
			pension_benefit_plan_ind
			tot_active_partcp_cnt
			type_pension_bnft_code
			type_plan_entity_ind
			t					/*this is used to de-duplicate*/
		);												
		set raw99;
		if /*redacted*/ then DC=/*redacted*/;	/*has DC characteristics*/
			else DC=/*redacted*/;
		if /*redacted*/ then DB=/*redacted*/;	/*has DB characteristics*/
			else DB=/*redacted*/;
		if /*redacted*/ then cash_bal=/*redacted*/;	/*is cash balance*/
			else cash_bal=/*redacted*/;
		if DB=/*redacted*/ & cash_bal=/*redacted*/ then xDB =/*redacted*/;				/*is DB but not cash_bal*/
			else xDB=/*redacted*/;
		plan_name = upcase(plan_name);
		if /*redacted*/ or 
		/*redacted*/ then supplemental=/*redacted*/;	/*is supplemental*/
			else supplemental=/*redacted*/;
		if supplemental=/*redacted*/ then delete;	/*drop supplemental plans*/
	
		form_plan_year_begin_date = left(trim(form_plan_year_begin_date));
		fpy_begin_yr = /*redacted*/; 	/*plan start year*/
		fpy_begin_mth = /*redacted*/; 	/*plan start month*/
		fpy_begin_day = /*redacted*/;	/*plan start day*/
		fpy_date = mdy(fpy_begin_mth,fpy_begin_day,fpy_begin_yr);		/*plan start sas date*/
		plan_qtr_strt = qtr(fpy_date);						/*plan calendar start qtr*/
		plan_year_strt = year(fpy_date);					/*plan calendar start year*/
	
		form_tax_prd = left(trim(form_tax_prd)); 
		ftp_yr = /*redacted*/; 				/*plan end year*/
		ftp_mth = /*redacted*/; 				/*plan end month*/
		ftp_day = /*redacted*/;				/*plan end day*/
		ftp_date = mdy(ftp_mth,ftp_day,ftp_yr);					/*plan end sas date*/
		plan_qtr_end = qtr(ftp_date);						/*plan calendar end qtr*/
		plan_year_end = year(ftp_date);						/*plan calendar end year*/
	
		form_year = /*redacted*/;						/*rename for consistency with research file*/
		filing_id_char = /*redacted*/;				/*string filing id*/
		adj_active = tot_active_partcp_cnt;				/*no editing for the raw file*/
		opr_ein = left(trim(spons_dfe_ein));				/*rename for consistency with research file*/
		opr_pn = left(trim(spons_dfe_pn));				/*rename for consistency with research file*/
		len_pn = length(opr_pn);					/*length of pn field*/
		if len_pn=/*redacted*/ then opr_pn = /*redacted*/;
		else if len_pn=/*redacted*/ then opr_pn = /*redacted*/;
		/*Redacted: removing invalid EIN/PNs*/
		plan_eff_year = /*redacted*/;		/*year that the plan started*/
	
		type_plan_entity_ind = left(trim(type_plan_entity_ind));
		if type_plan_entity_ind^=/*redacted*/ then delete;			/*keep only single employer plans*/
	
		if DB=/*redacted*/ and DC=/*redacted*/ then delete; 					/*remove welfare plans*/
		pension_benefit_plan_ind = /*redacted*/;					/*all plans are now pension plans*/
		t = /*redacted*/;								/*helper used for de-duplication step*/
	run;

	/*Set variable lengths to enable error-free appending with the research file*/
	data raw99;
	
		/*redacted*/
		
		set raw99;
	run;

	/**************************************************************************
	* Step 3: De-duplicate the file (pick the max filing id for each EIN-PN)
	***************************************************************************/
	
	proc sort data = raw99;
		by opr_ein opr_pn plan_year_end;
	run;
	
	proc means noprint data = raw99;
		by opr_ein opr_pn plan_year_end;
		var filing_id;
		output out = best_filing 
		max(filing_id) = max_filing_id;
	run;
	
	proc sql;
		create table clean99 as
		select a.*, b.max_filing_id
		from raw99 as a inner join
		best_filing as b
		on a.filing_id = b.max_filing_id;
	quit;
	
	data  clean99;
		set clean99 (drop= t max_filing_id);
		source_id = /*redacted*/; /*id for raw file*/
	run;
	
	/*Remove exact filing_id dups*/
	proc sort data = clean99 nodupkey;
	      by opr_ein opr_pn plan_year_end filing_id_char;
	run;

	/**************************************************************************
	* Step 4: Count up # DB plans per EIN
	***************************************************************************/
	
	proc sort data = clean99;
		by opr_ein;
	run;
	
	proc means noprint data = clean99;
		by opr_ein;
		var DB;				
		output out = plan_count99
			sum(DB) = num_DB_plans;
	run;
	
	/*Check how many participants are in firms with 2+ DB plans*/
	proc sql;
		create table pens99_plncnt as 
		select *
		from clean99 as t,
		plan_count99 as m
		where t.opr_ein = m.opr_ein;
	quit;
		
	data blue.pensplan_samp99;
		set pens99_plncnt
		(drop = _type_ _freq_ xDB );
		adj_DB_partcp = DB*adj_active;
		DB_partcp = DB*tot_active_partcp_cnt;
	run;
	
	/*De-dup small number of cases where filing errors create EIN-PN_Plan year end dups
	  i.e. the filing_id's are different but the plan years are mis-aligned*/
	proc sort data = blue.pensplan_samp99 nodupkey;
	  by opr_ein opr_pn plan_year_end;
	run;
	
	proc freq data = blue.pensplan_samp99;
	  tables DB*DC;
	run;	
	
	/**************************************************************************
	* Step 5: Add data from schedule H (employer and employee contributions)
	***************************************************************************/
	
	/*Input schedule H raw data*/
	PROC IMPORT OUT= WORK.schh1999 
		    DATAFILE= "/.../F5500/f5500/F_SCH_H_1999.csv" 
		    DBMS=CSV REPLACE;
	    GETNAMES=YES;
	    DATAROW=/*redacted*/; 
	RUN;

	data schh1999  (keep = filing_id_char opr_ein opr_pn emplr_contrib_income_amt_n employee_contrib_income_amt_n tot_income_amt_n plan_year_end);
		set schh1999 (rename = (emplr_contrib_income_amt=emplr_contrib_income_amt_n 
		participant_contrib_amt=employee_contrib_income_amt_n
		tot_income_amt = tot_income_amt_n));
		filing_id_char = /*redacted*/;
		opr_ein = left(trim(sch_h_ein));
		opr_pn = left(trim(sch_h_pn));
		schh_tax_prd = left(trim(sch_h_tax_prd)); 
		ftp_yr = /*redacted*/; 				/*plan end year*/
		ftp_mth = /*redacted*/; 				/*plan end month*/
		ftp_day = /*redacted*/;				/*plan end day*/
		ftp_date = mdy(ftp_mth,ftp_day,ftp_yr);					/*plan end sas date*/
		plan_qtr_end = qtr(ftp_date);						/*plan calendar end qtr*/
		plan_year_end = year(ftp_date);						/*plan calendar end year*/
	run;

	/*Input schedule I raw data*/
	PROC IMPORT OUT= WORK.schi1999
		    DATAFILE= "/.../F5500/f5500/F_SCH_I_1999.csv" 
		    DBMS=CSV REPLACE;
	    GETNAMES=YES;
	    DATAROW=/*redacted*/; 
	RUN;

	data schi1999 (keep = filing_id_char opr_ein opr_pn emplr_contrib_income_amt_n employee_contrib_income_amt_n tot_income_amt_n plan_year_end);
		set schi1999 (rename = (small_emplr_contrib_income_amt=emplr_contrib_income_amt_n 
		small_participant_contrib_amt=employee_contrib_income_amt_n
		small_tot_income_amt = tot_income_amt_n));
		filing_id_char = /*redacted*/;
		opr_ein = left(trim(sch_i_ein));
		opr_pn = left(trim(sch_i_plan_num));
		schi_tax_prd = left(trim(sch_i_tax_prd)); 
		ftp_yr = /*redacted*/; 				/*plan end year*/
		ftp_mth = /*redacted*/; 				/*plan end month*/
		ftp_day = /*redacted*/;				/*plan end day*/
		ftp_date = mdy(ftp_mth,ftp_day,ftp_yr);					/*plan end sas date*/
		plan_qtr_end = qtr(ftp_date);						/*plan calendar end qtr*/
		plan_year_end = year(ftp_date);						/*plan calendar end year*/		
	run;

	/*Append the files*/
	data blue.schhi1999;
	      set schh1999 schi1999;
	run;
	
	/*De-duplicate H and I schedules*/
	proc sort data = blue.schhi1999 nodupkey;
	      by filing_id_char;
	run;


	/*Clean up the working directory*/
	proc datasets lib=work nolist kill;
	quit;
	run;
		
